0. Requirements
Note: Please load the workspace in the directory
implementation/R/workspace/preprocessing.RData to run the
following code and re-use the previously created variables. Furthermore,
the following libraries must be installed and loaded:
# install necessary packages
#install.packages("quanteda")
#install.packages("readtext")
#install.packages("tidyverse")
#install.packages("quanteda.textstats")
#install.packages("quanteda.textplots")
#install.packages("data.table")
#install.packages("stringr")
#install.packages("spacyr")
#install.packages("textcat")
#install.packages("plyr")
# load libraries
library(quanteda)
library(readtext)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ───────────────────────────────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.5 ✓ purrr 0.3.4
✓ tibble 3.1.6 ✓ dplyr 1.0.8
✓ tidyr 1.2.0 ✓ stringr 1.4.0
✓ readr 2.1.2 ✓ forcats 0.5.1
── Conflicts ──────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(quanteda.textplots)
library(quanteda.textstats)
library(plyr)
--------------------------------------------------------------------------------------------
You have loaded plyr after dplyr - this is likely to cause problems.
If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
library(plyr); library(dplyr)
--------------------------------------------------------------------------------------------
Attache Paket: ‘plyr’
Die folgenden Objekte sind maskiert von ‘package:dplyr’:
arrange, count, desc, failwith, id, mutate, rename, summarise, summarize
Das folgende Objekt ist maskiert ‘package:purrr’:
compact
library(dplyr)
#library(stringr)
#library(data.table)
#library(textcat)
1. Collocations
To retrieve the direct collocations, i.e. one token to the left or
right of the compound word, we use the kwic function
offered by quanteda. Here we can choose a window of 1 to
make sure we obtain the correct number of collocations.
1.1 First Look
Let’s have a first look at the collocations for the example
“Klimaleugner” (en: “climate denier”). We are going to retrieve the
collocations to the left (pre) and to the right
(post) of the key word and count their occurrences. Then,
we will output the Top-5 collocations for each category,
i.e. pre and post.
# keyword whose direct neighbours we want to inspect
word <- "klimaleugner"
# keyword-in-context hits in C2022 (one token of context on each side)
kwic_con <- as_tibble(
  kwic(sp_c2022_tokens, pattern = word, window = 1, valuetype = "fixed")
)
# keyword-in-context hits in P2022 (same settings)
kwic_pro <- as_tibble(
  kwic(sp_p2022_tokens, pattern = word, window = 1, valuetype = "fixed")
)
Let’s show the Top-5 for the C2022 corpus:
# five most frequent tokens directly before the keyword in C2022
head(arrange(dplyr::count(kwic_con, pre), desc(n)), n = 5)
# five most frequent tokens directly after the keyword in C2022
head(arrange(dplyr::count(kwic_con, post), desc(n)), n = 5)
And the Top-5 for the P2022 corpus:
# five most frequent tokens directly before the keyword in P2022
head(arrange(dplyr::count(kwic_pro, pre), desc(n)), n = 5)
# five most frequent tokens directly after the keyword in P2022
head(arrange(dplyr::count(kwic_pro, post), desc(n)), n = 5)
1.2 Apply to all Glossary Terms
Now we seek to create tables that contain the top 5 pre
and post collocations for each of our compound words.
Firstly, we create a table for the collocations we can obtain from
P2022
# For each compound, collect the top-5 collocations directly before ("pre")
# and directly after ("post") the keyword in P2022.
# The per-compound tables are stored in a pre-allocated list and combined
# once at the end, instead of growing a data frame with rbind() on every
# iteration (which copies the whole table each time).
pro_colls_parts <- vector("list", length(compounds))
i <- 0L
# for each compound
for (word in compounds) {
  i <- i + 1L
  keyword <- word
  # get collocations (window of one token on each side)
  kwic_pro <- kwic(sp_p2022_tokens, pattern = word, window = 1,
                   valuetype = "fixed") %>%
    as_tibble()
  # retrieve top5 preceding collocations
  pro_pre <- kwic_pro %>%
    dplyr::count(pre) %>%
    dplyr::arrange(dplyr::desc(n)) %>%
    head(n = 5) %>%
    dplyr::rename(word = pre)
  # retrieve top5 following collocations
  pro_post <- kwic_pro %>%
    dplyr::count(post) %>%
    dplyr::arrange(dplyr::desc(n)) %>%
    head(n = 5) %>%
    dplyr::rename(word = post)
  # normalize both tables to the columns word, n, keyword, tag
  pro_pre$keyword <- keyword
  pro_pre$tag <- "pre"
  pro_post$keyword <- keyword
  pro_post$tag <- "post"
  # "pre" rows first, then "post" rows, as before
  pro_colls_parts[[i]] <- dplyr::bind_rows(pro_pre, pro_post)
}
pro_colls10 <- dplyr::bind_rows(pro_colls_parts)
Most of the collocations only occur exactly once. Since this is not
very informative for us, we remove all the collocations with a count of
exactly 1. Also, we want to remove noise, i.e. empty strings from the
collocations.
# only keep collocations that appear more than once
top_colls_pro <- pro_colls10[pro_colls10$n > 1, ]
# remove empty (or whitespace-only) collocates; test the string itself
# instead of comparing it with " ", because the result of a string
# comparison depends on the collation order of the current locale
top_colls_pro <- top_colls_pro[nzchar(trimws(top_colls_pro$word)), ]
And save the table to a csv file.
#write.csv(top_colls_pro, "../output/top_collocations_pro.csv")
Then, we create the same table of the top 5 pre and
post collocations for the C2022.
# For each compound, collect the top-5 collocations directly before ("pre")
# and directly after ("post") the keyword in C2022.
# The per-compound tables are stored in a pre-allocated list and combined
# once at the end, instead of growing a data frame with rbind() on every
# iteration (which copies the whole table each time).
con_colls_parts <- vector("list", length(compounds))
i <- 0L
# for each compound
for (word in compounds) {
  i <- i + 1L
  keyword <- word
  # get collocations (window of one token on each side)
  kwic_con <- kwic(sp_c2022_tokens, pattern = word, window = 1,
                   valuetype = "fixed") %>%
    as_tibble()
  # retrieve top5 preceding collocations
  con_pre <- kwic_con %>%
    dplyr::count(pre) %>%
    dplyr::arrange(dplyr::desc(n)) %>%
    head(n = 5) %>%
    dplyr::rename(word = pre)
  # retrieve top5 following collocations
  con_post <- kwic_con %>%
    dplyr::count(post) %>%
    dplyr::arrange(dplyr::desc(n)) %>%
    head(n = 5) %>%
    dplyr::rename(word = post)
  # normalize both tables to the columns word, n, keyword, tag
  con_pre$keyword <- keyword
  con_pre$tag <- "pre"
  con_post$keyword <- keyword
  con_post$tag <- "post"
  # "pre" rows first, then "post" rows, as before
  con_colls_parts[[i]] <- dplyr::bind_rows(con_pre, con_post)
}
con_colls10 <- dplyr::bind_rows(con_colls_parts)
And, just like before, we remove the collocations that appeared only
once in the corpus (and remove noise, i.e. empty strings from the
collocations).
# only keep collocations that appear more than once
top_colls_con <- con_colls10[con_colls10$n > 1, ]
# remove empty (or whitespace-only) collocates; test the string itself
# instead of comparing it with " ", because the result of a string
# comparison depends on the collation order of the current locale
top_colls_con <- top_colls_con[nzchar(trimws(top_colls_con$word)), ]
And save the final table to a csv file.
write.csv(x = top_colls_con, file = "../output/top_collocations_con1.csv") # export filtered C2022 table
2. Concordances (KWIC)
To retrieve the context of each compound word, we extract the
concordances on a sentence level. That means, we extract a window of 5
sentences to the left and to the right of the keyword sentence. To do
this, we must tokenize our data by sentences, instead of words.
2.1 Preprocessing
Since we cannot normalize the data the same way when we tokenize it
on sentence-level, we firstly create word tokens from the corpora.
# word-level tokenisation shared by both corpora: punctuation is kept,
# symbols, numbers, URLs and separators are removed
tokenize_words <- function(x) {
  tokens(x, remove_punct = FALSE, remove_symbols = TRUE, remove_numbers = TRUE,
         remove_url = TRUE, remove_separators = TRUE)
}
# word tokens for P2022 and C2022
p2022_tokens <- tokenize_words(pro2022)
c2022_tokens <- tokenize_words(contra2022)
To these tokens, we apply a normalization step where we remove
hyphens within words, such as “Klima-Skeptiker” to convert it to
“Klimaskeptiker”.
# remove hyphens from tokens

### P2022
# replace multi-token sequences with a "compound" token
toks_comp_p <- tokens_compound(p2022_tokens, phrase("*-*"), concatenator = "")
# get token types containing a hyphen between word characters
toks_hyphenated_p <- grep("\\w+-\\w+", types(toks_comp_p), value = TRUE)
# replace the hyphenated types by versions without hyphen; the patterns are
# literal token types, so they are matched exactly ("fixed", case-sensitive)
# rather than as case-insensitive glob patterns, which could map a type to
# the replacement of a differently-cased or wildcard-matching type
p2022_toks_cleaned <- tokens_replace(
  toks_comp_p,
  pattern = toks_hyphenated_p,
  replacement = gsub("-", "", toks_hyphenated_p, fixed = TRUE),
  valuetype = "fixed",
  case_insensitive = FALSE
)

### C2022
# replace multi-token sequences with a "compound" token
toks_comp_c <- tokens_compound(c2022_tokens, phrase("*-*"), concatenator = "")
# get token types containing a hyphen between word characters
toks_hyphenated_c <- grep("\\w+-\\w+", types(toks_comp_c), value = TRUE)
# replace the hyphenated types by versions without hyphen (exact matching)
c2022_toks_cleaned <- tokens_replace(
  toks_comp_c,
  pattern = toks_hyphenated_c,
  replacement = gsub("-", "", toks_hyphenated_c, fixed = TRUE),
  valuetype = "fixed",
  case_insensitive = FALSE
)

# merge tokens back into corpus objects: one space-separated string per
# document; vapply() guarantees a character vector even for empty input
p2022_merged_toks <- corpus(
  vapply(as.list(p2022_toks_cleaned), paste, character(1), collapse = " ")
)
c2022_merged_toks <- corpus(
  vapply(as.list(c2022_toks_cleaned), paste, character(1), collapse = " ")
)
Now we can create sentence tokens for both corpora.
# sentence-level tokenisation shared by both corpora (same removal options
# as for the word tokens, but what = "sentence")
tokenize_sentences <- function(x) {
  tokens(x, remove_punct = FALSE, remove_symbols = TRUE, remove_numbers = TRUE,
         remove_url = TRUE, remove_separators = TRUE, what = "sentence")
}
# "sentence" tokens for the P2022 and C2022 corpus
p2022_sentences <- tokenize_sentences(p2022_merged_toks)
c2022_sentences <- tokenize_sentences(c2022_merged_toks)
2.2 Key Word In Context Retrieval
# Collect, for every compound, the sentence tokens matching it together with
# a window of 5 sentence tokens before and after.
# The former 7-column empty data-frame placeholders were dropped: they were
# overwritten by the rbind() result without ever being used.

### DO FOR p2022 ###
kwiclist_sent_pro <- list()
# for each compound word
for (word in compounds) {
  # the compound is used as a regular expression on the sentence tokens
  context_pro_sent <- kwic(p2022_sentences, pattern = word,
                           valuetype = "regex", window = 5)
  kwiclist_sent_pro[[word]] <- context_pro_sent # save to list
}
kwic_pro_sent.df <- do.call(rbind, kwiclist_sent_pro) # combine all hits

### DO FOR c2022 ###
kwiclist_sent_con <- list()
# for each compound word
for (word in compounds) {
  # the compound is used as a regular expression on the sentence tokens
  context_con_sent <- kwic(c2022_sentences, pattern = word,
                           valuetype = "regex", window = 5)
  kwiclist_sent_con[[word]] <- context_con_sent # save to list
}
kwic_con_sent.df <- do.call(rbind, kwiclist_sent_con) # combine all hits
kwic_pro_sent.df
Keyword-in-context with 803 matches.
kwic_con_sent.df
Keyword-in-context with 1,946 matches.
[ reached max_nrow ... 946 more matches ]
2.3 Export Concordances
# write the sentence-level concordances of both corpora to csv (no row names)
write.csv(x = kwic_pro_sent.df, file = "../output/pro_context_new.csv", row.names = FALSE)
write.csv(x = kwic_con_sent.df, file = "../output/con_context_new.csv", row.names = FALSE)
3. Term Frequencies
Additionally, we compute the term frequencies of each compound word
and the corresponding TF-IDF scores, since the two corpora differ in size
and we want to explore the relevance of each term.
Create a function to normalize the TF-IDF scores
# Min-max normalization: linearly rescales x to the range [0, 1], relative
# to the values contained in x itself.
#
# x      numeric vector (names, if any, are kept)
# na.rm  if TRUE (default), missing values are ignored when determining the
#        minimum and maximum; NA entries stay NA in the result. If FALSE,
#        any NA in x makes the whole result NA.
# Returns a numeric vector of the same length as x. If all values of x are
# equal, the range is zero and the result is NaN.
normalize <- function(x, na.rm = TRUE) {
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}
# re-create one corpus per subdiscourse from the word tokens: every document
# becomes a single space-separated string (vapply() guarantees a character
# vector, whereas sapply() changes its return type on empty input)
p2022_cleaned <- corpus(
  vapply(as.list(sp_p2022_tokens), paste, character(1), collapse = " ")
)
c2022_cleaned <- corpus(
  vapply(as.list(sp_c2022_tokens), paste, character(1), collapse = " ")
)
# label the documents with their group
p2022_cleaned$group <- "activists"
c2022_cleaned$group <- "skeptics"
# corpus containing both subdiscourses
complete <- p2022_cleaned + c2022_cleaned
# create dfm with frequencies per group
# (tokenise explicitly first: calling dfm() directly on a corpus is
# deprecated and triggers the "Use 'tokens()' first" warning)
dfm_complete_freq <- tokens(complete) %>%
  dfm() %>%
  dfm_keep(pattern = compounds) %>% # only keep compound words
  dfm_group(groups = group) # combine documents by their "group" docvar
Warnung: 'dfm.corpus()' is deprecated. Use 'tokens()' first.
# convert dfm to data frame and transpose it (t() returns a matrix)
dfm_complete_df <- dfm_complete_freq %>%
  convert(to = "data.frame") %>%
  t()
set.seed(132) # set seed for reproducibility
textplot_wordcloud(dfm_complete_freq, comparison = TRUE, max_words = 250) # plot wordcloud

# export the term frequencies to the project-relative output folder used by
# the other exports, instead of a machine-specific absolute path
# NOTE(review): assumes the notebook is run from a sibling folder of output/ - confirm
write.csv(dfm_complete_df, "../output/tf_complete1.csv", row.names = TRUE)
And compute TF-IDF of the DFMs
### FOR C2022
# tf-idf weighted dfm of the tokens, restricted to the compound words
dfm_c2022 <- dfm(sp_c2022_tokens) %>%
  dfm_tfidf() %>%
  dfm_keep(pattern = compounds)
# normalized scores per compound; all features are requested instead of a
# fixed maximum of 300, so no compound is silently dropped
c2022_tfidf <- normalize(topfeatures(dfm_c2022, n = nfeat(dfm_c2022)))
# convert into data frame, highest score first
top_c2022_norm <- data.frame(Term = names(c2022_tfidf), Freq = c2022_tfidf,
                             row.names = NULL) %>%
  dplyr::arrange(dplyr::desc(Freq))

### FOR P2022
# tf-idf weighted dfm of the tokens, restricted to the compound words
dfm_p2022 <- dfm(sp_p2022_tokens) %>%
  dfm_tfidf() %>%
  dfm_keep(pattern = compounds)
# normalized scores per compound (all features, see above)
p2022_tfidf <- normalize(topfeatures(dfm_p2022, n = nfeat(dfm_p2022)))
# convert into data frame, highest score first
top_p2022_norm <- data.frame(Term = names(p2022_tfidf), Freq = p2022_tfidf,
                             row.names = NULL) %>%
  dplyr::arrange(dplyr::desc(Freq))

# change column names to be able to merge both data frames
colnames(top_p2022_norm)[2] <- "Freq_P2022"
colnames(top_c2022_norm)[2] <- "Freq_C2022"
# full outer join on the term: keep compounds that occur in only one corpus
df_merge <- merge(top_c2022_norm, top_p2022_norm, by = "Term", all = TRUE)
# write to csv file in the project-relative output folder used by the other
# exports, instead of a machine-specific absolute path
# NOTE(review): assumes the notebook is run from a sibling folder of output/ - confirm
write.csv(df_merge, "../output/tfidf_complete.csv", row.names = TRUE)
Plot TF-IDF Scores
# Compare the normalized TF-IDF scores of the compound words between groups.
# dplyr verbs are called with an explicit namespace so the result does not
# depend on whether plyr or dplyr was attached last; the former no-op
# filter() calls (no condition given) were removed.

# feature-level scores of the tf-idf weighted dfms (force = TRUE because the
# dfms hold weights rather than counts)
freqs_pro <- textstat_frequency(dfm_p2022, force = TRUE)
freqs_con <- textstat_frequency(dfm_c2022, force = TRUE)
# title-case the terms for display
freqs_pro$feature <- str_to_title(freqs_pro$feature)
freqs_con$feature <- str_to_title(freqs_con$feature)
# min-max normalize the scores within each corpus, rounded to 3 decimals
freqs_pro$normalize <- round(normalize(freqs_pro$frequency), 3)
freqs_con$normalize <- round(normalize(freqs_con$frequency), 3)
# keep only the term and its normalized score
freqs.act <- freqs_pro %>%
  as.data.frame() %>%
  dplyr::select(feature, normalize)
freqs.scept <- freqs_con %>%
  as.data.frame() %>%
  dplyr::select(feature, normalize)
# join on the term (normalize.x = P2022, normalize.y = C2022), keep the first
# 50 rows, sort by the P2022 score and fix that order as factor levels so the
# plot axis follows it
freqs <- dplyr::left_join(freqs.act, freqs.scept, by = "feature") %>%
  head(50) %>%
  dplyr::arrange(normalize.x) %>%
  dplyr::mutate(feature = factor(feature, feature))
# create plot: one point per group and term, connected by a grey segment
plot8 <- ggplot(freqs) +
  geom_segment(aes(x = feature, xend = feature, y = normalize.x, yend = normalize.y),
               color = "grey") +
  geom_point(aes(x = feature, y = normalize.x, colour = "Activists"), size = 3) +
  geom_point(aes(x = feature, y = normalize.y, colour = "Sceptics"), size = 3) +
  ggtitle("Comparison 'Klima' TF-IDF Scores per Group") +
  xlab("") + ylab("TF-IDF") +
  coord_flip()
plot8 + labs(colour = "Group")
Warnung: Removed 20 rows containing missing values (geom_segment).
Warnung: Removed 20 rows containing missing values (geom_point).
#ggsave("/Users/anna/Documents/uni/thesis/plots/comparison_klima_freqs_lemma.png", dpi=300, dev='png', height=6, width=12, units="in")
# save the most recently displayed plot; the argument is spelled out as
# `device` instead of relying on partial matching of `dev`
ggsave("/Users/anna/Documents/uni/thesis/plots/comparison_tfidf.png", dpi = 300, device = "png", height = 10, width = 15, units = "in")
Warnung: Removed 20 rows containing missing values (geom_segment).
Warnung: Removed 20 rows containing missing values (geom_point).

TO DELETE!!!!
Plot TF-IDF Scores
# Scratch variant of the TF-IDF comparison plot (this section is marked
# "TO DELETE" in the notebook): scores are computed on the full dfms and only
# afterwards restricted to the compound list.

# document-feature matrices of both corpora and their tf-idf weighting
c2022_dfm <- dfm(sp_c2022_tokens)
p2022_dfm <- dfm(sp_p2022_tokens)
c2022_tfidf <- dfm_tfidf(c2022_dfm)
p2022_tfidf <- dfm_tfidf(p2022_dfm)
# feature-level scores of the weighted matrices
freqs_pro <- textstat_frequency(p2022_tfidf, force = TRUE)
freqs_con <- textstat_frequency(c2022_tfidf, force = TRUE)
# keep only words that are contained in the final compound list
in_list_pro <- freqs_pro$feature %in% compounds
in_list_con <- freqs_con$feature %in% compounds
freqs_pro_subset <- freqs_pro[in_list_pro, ]
freqs_con_subset <- freqs_con[in_list_con, ]
# title-case the terms for display
freqs_pro_subset$feature <- str_to_title(freqs_pro_subset$feature)
freqs_con_subset$feature <- str_to_title(freqs_con_subset$feature)
# normalized scores, rounded to 3 decimals
freqs_pro_subset$normalize <- round(normalize(freqs_pro_subset$frequency), 3)
freqs_con_subset$normalize <- round(normalize(freqs_con_subset$frequency), 3)
# plot data: term and normalized score per group, joined on the term
freqs.act <- select(as.data.frame(filter(freqs_pro_subset)), feature, normalize)
freqs.scept <- select(as.data.frame(filter(freqs_con_subset)), feature, normalize)
freqs_joined <- left_join(freqs.act, freqs.scept, by = "feature")
freqs <- mutate(arrange(head(freqs_joined, 30), normalize.x),
                feature = factor(feature, feature))
# create plot
plot8 <- ggplot(freqs) +
  geom_segment(aes(x = feature, xend = feature, y = normalize.x, yend = normalize.y),
               color = "grey") +
  geom_point(aes(x = feature, y = normalize.x, colour = "Activists"), size = 3) +
  geom_point(aes(x = feature, y = normalize.y, colour = "Sceptics"), size = 3) +
  ggtitle("Comparison 'Klima' TF-IDF Scores per Group") +
  xlab("") + ylab("TF-IDF") +
  coord_flip()
plot8 + labs(colour = "Group")
Warnung: Removed 8 rows containing missing values (geom_segment).
Warnung: Removed 8 rows containing missing values (geom_point).
ggsave("/Users/anna/Documents/uni/thesis/plots/comparison_klima_freqs_lemma.png", dpi = 300, device = "png", height = 6, width = 12, units = "in") # `device` spelled out instead of partially matched `dev`
Warnung: Removed 8 rows containing missing values (geom_segment).
Warnung: Removed 8 rows containing missing values (geom_point).

TO DELETE
# ad-hoc checks for single compounds
# c2022_merged_toks is a corpus, so it is tokenised before the lookup:
# kwic() is meant to be called on tokens objects
kwic(tokens(c2022_merged_toks), pattern = "klimaanbeter", window = 1, valuetype = "regex")
kwic(c2022_sentences, pattern = "klimabank", window = 1, valuetype = "regex")
1.2 Apply to all Glossary Terms
Now, we will retrieve the collocations for each compound word. The
collocations are then saved to a data frame, one for each corpus, and
exported to a csv file, such that we can also use the data in Python
# Collect all keyword-in-context hits (window of one token) of every compound
# word, one data frame per corpus.
# The hits are gathered in pre-allocated lists and combined once, instead of
# growing the data frames with rbind() on every iteration.

# empty template with the kwic columns, so the results keep their columns
# even when there are no hits ("pre" is text like "post", not logical)
kwic_template <- data.frame(docname = character(),
                            from = integer(),
                            to = integer(),
                            pre = character(),
                            keyword = character(),
                            post = character(),
                            pattern = factor())

### C2022
colls_con_parts <- vector("list", length(compounds))
i <- 0L
# for each compound word
for (word in compounds) {
  i <- i + 1L
  # look up collocations
  colls <- kwic(sp_c2022_tokens, pattern = word, window = 1, valuetype = "fixed") %>%
    as_tibble()
  colls_con_parts[[i]] <- colls
}
# the template comes first, so the result is a plain data frame as before
collocations_con <- dplyr::bind_rows(c(list(kwic_template), colls_con_parts))

### P2022
colls_pro_parts <- vector("list", length(compounds))
i <- 0L
# for each compound
for (word in compounds) {
  i <- i + 1L
  # look up collocations
  colls <- kwic(sp_p2022_tokens, pattern = word, window = 1, valuetype = "fixed") %>%
    as_tibble()
  colls_pro_parts[[i]] <- colls
}
collocations_pro <- dplyr::bind_rows(c(list(kwic_template), colls_pro_parts))
Please run the following lines to save the output to a csv file.
#write.csv(collocations_con, "../output/collocations_con.csv")
#write.csv(collocations_pro, "../output/collocations_pro.csv")
TO REPLACE COMPOUND FORMS BY THEIR LEMMA
# token forms to be replaced (names) and their replacements (values)
lemma_fixes <- c(
  klimaglaubenslehr = "klimaglaubenslehre",
  klimakarawan = "klimakarawane",
  klimazeug = "klimazeugs",
  klimawendehal = "klimawendehals"
)
# apply all replacements in one pass per corpus (exact whole-token matches)
sp_c2022_tokens <- tokens_replace(sp_c2022_tokens,
                                  pattern = names(lemma_fixes),
                                  replacement = unname(lemma_fixes),
                                  valuetype = "fixed")
sp_p2022_tokens <- tokens_replace(sp_p2022_tokens,
                                  pattern = names(lemma_fixes),
                                  replacement = unname(lemma_fixes),
                                  valuetype = "fixed")
# Replace every listed word form by the "original" entry of its compound.
# NOTE: unlist_forms() must be defined before this loop is run.
# Fixes: c(...) instead of the invalid ["..."] literal; base grepl() instead
# of %like%, which needs data.table (not attached in this notebook).
for (word_form in c("glaubenslehre")) {
  word <- unlist_forms(word_form) # split into the individual word forms
  # first compound whose list of forms matches the first word form
  is_match <- grepl(word[[1]], compound_df$compound_forms)
  original <- compound_df[is_match, ]$original[[1]]
  lemma <- rep(original, length(word))
  # replace string in tokens with lemma form (for pro2000 and contra2000)
  pro2000_tokens <- tokens_replace(pro2000_tokens, word, lemma, valuetype = "fixed")
  contra2000_tokens <- tokens_replace(contra2000_tokens, word, lemma, valuetype = "fixed")
}
# Flatten comma-separated lists of word forms into one character vector.
#
# word  character vector; each element holds one or more word forms
#       separated by commas
# Returns a character vector with one element per word form, with all
# spaces removed.
unlist_forms <- function(word) {
  pieces <- strsplit(word, ",")
  gsub(" ", "", unlist(pieces))
}
# flatten the word forms of all compounds into one vector
raw_forms <- compound_df$compound_forms
compound_forms <- unlist_forms(raw_forms)
# show the resulting word forms
compound_forms
---
title: "Thesis: Corpus-Based Methods"
output: html_notebook
---

# 0. Requirements
Note: Please load the workspace in the directory `implementation/R/workspace/preprocessing.RData` to run the following code and re-use the previously created variables. Furthermore, the following libraries must be installed and loaded:
```{r message=FALSE, warning=FALSE}
# install necessary packages
#install.packages("quanteda")
#install.packages("readtext")
#install.packages("tidyverse")
#install.packages("quanteda.textstats")
#install.packages("quanteda.textplots")
#install.packages("data.table")
#install.packages("stringr")
#install.packages("spacyr")
#install.packages("textcat")
#install.packages("plyr")

# load libraries
# plyr is attached first: when it is attached after dplyr, it masks
# arrange(), count(), desc(), mutate(), rename() and summarise() and
# triggers the "You have loaded plyr after dplyr" warning
library(plyr)
library(quanteda)
library(readtext)
library(tidyverse)
library(quanteda.textplots)
library(quanteda.textstats)
library(dplyr)
#library(stringr)
#library(data.table)
#library(textcat)
```

# 1. Collocations
To retrieve the direct collocations, i.e. one token to the left or right of the compound word, we use the `kwic` function offered by `quanteda`. Here we can choose a window of 1 to make sure we obtain the correct number of collocations.

## 1.1 First Look
Let's have a first look at the collocations for the example "Klimaleugner" (en: "climate denier"). We are going to retrieve the collocations to the left (`pre`) and to the right (`post`) of the key word and count their occurrences. Then, we will output the Top-5 collocations for each category, i.e. `pre` and `post`. 
```{r}
# keyword whose direct neighbours we want to inspect
word <- "klimaleugner"

# keyword-in-context hits in C2022 (one token of context on each side)
kwic_con <- as_tibble(
  kwic(sp_c2022_tokens, pattern = word, window = 1, valuetype = "fixed")
)

# keyword-in-context hits in P2022 (same settings)
kwic_pro <- as_tibble(
  kwic(sp_p2022_tokens, pattern = word, window = 1, valuetype = "fixed")
)
```


Let's show the Top-5 for the C2022 corpus:
```{r}
# five most frequent tokens directly before the keyword in C2022
head(arrange(dplyr::count(kwic_con, pre), desc(n)), n = 5)

# five most frequent tokens directly after the keyword in C2022
head(arrange(dplyr::count(kwic_con, post), desc(n)), n = 5)
```

And the Top-5 for the P2022 corpus:
```{r}
# five most frequent tokens directly before the keyword in P2022
head(arrange(dplyr::count(kwic_pro, pre), desc(n)), n = 5)

# five most frequent tokens directly after the keyword in P2022
head(arrange(dplyr::count(kwic_pro, post), desc(n)), n = 5)
```

## 1.2 Apply to all Glossary Terms
Now we seek to create tables that contain the top 5 `pre` and `post` collocations for each of our compound words. 
Firstly, we create a table for the collocations we can obtain from P2022
```{r}
# For each compound, collect the top-5 collocations directly before ("pre")
# and directly after ("post") the keyword in P2022.
# The per-compound tables are stored in a pre-allocated list and combined
# once at the end, instead of growing a data frame with rbind() on every
# iteration (which copies the whole table each time).
pro_colls_parts <- vector("list", length(compounds))
i <- 0L

# for each compound
for (word in compounds) {
  i <- i + 1L
  keyword <- word

  # get collocations (window of one token on each side)
  kwic_pro <- kwic(sp_p2022_tokens, pattern = word, window = 1,
                   valuetype = "fixed") %>%
    as_tibble()

  # retrieve top5 preceding collocations
  pro_pre <- kwic_pro %>%
    dplyr::count(pre) %>%
    dplyr::arrange(dplyr::desc(n)) %>%
    head(n = 5) %>%
    dplyr::rename(word = pre)

  # retrieve top5 following collocations
  pro_post <- kwic_pro %>%
    dplyr::count(post) %>%
    dplyr::arrange(dplyr::desc(n)) %>%
    head(n = 5) %>%
    dplyr::rename(word = post)

  # normalize both tables to the columns word, n, keyword, tag
  pro_pre$keyword <- keyword
  pro_pre$tag <- "pre"
  pro_post$keyword <- keyword
  pro_post$tag <- "post"

  # "pre" rows first, then "post" rows, as before
  pro_colls_parts[[i]] <- dplyr::bind_rows(pro_pre, pro_post)
}

pro_colls10 <- dplyr::bind_rows(pro_colls_parts)
```

Most of the collocations only occur exactly once. Since this is not very informative for us, we remove all the collocations with a count of exactly 1. Also, we want to remove noise, i.e. empty strings from the collocations. 
```{r}
# only keep collocations that appear more than once
top_colls_pro <- pro_colls10[pro_colls10$n > 1, ]

# remove empty (or whitespace-only) collocates; test the string itself
# instead of comparing it with " ", because the result of a string
# comparison depends on the collation order of the current locale
top_colls_pro <- top_colls_pro[nzchar(trimws(top_colls_pro$word)), ]
```

And save the table to a csv file.
```{r}
#write.csv(top_colls_pro, "../output/top_collocations_pro.csv")
```

Then, we create the same table of the top 5 `pre` and `post` collocations for the C2022.
```{r}
# For each compound, collect the top-5 collocations directly before ("pre")
# and directly after ("post") the keyword in C2022.
# The per-compound tables are stored in a pre-allocated list and combined
# once at the end, instead of growing a data frame with rbind() on every
# iteration (which copies the whole table each time).
con_colls_parts <- vector("list", length(compounds))
i <- 0L

# for each compound
for (word in compounds) {
  i <- i + 1L
  keyword <- word

  # get collocations (window of one token on each side)
  kwic_con <- kwic(sp_c2022_tokens, pattern = word, window = 1,
                   valuetype = "fixed") %>%
    as_tibble()

  # retrieve top5 preceding collocations
  con_pre <- kwic_con %>%
    dplyr::count(pre) %>%
    dplyr::arrange(dplyr::desc(n)) %>%
    head(n = 5) %>%
    dplyr::rename(word = pre)

  # retrieve top5 following collocations
  con_post <- kwic_con %>%
    dplyr::count(post) %>%
    dplyr::arrange(dplyr::desc(n)) %>%
    head(n = 5) %>%
    dplyr::rename(word = post)

  # normalize both tables to the columns word, n, keyword, tag
  con_pre$keyword <- keyword
  con_pre$tag <- "pre"
  con_post$keyword <- keyword
  con_post$tag <- "post"

  # "pre" rows first, then "post" rows, as before
  con_colls_parts[[i]] <- dplyr::bind_rows(con_pre, con_post)
}

con_colls10 <- dplyr::bind_rows(con_colls_parts)
```

And, just like before, we remove the collocations that appeared only once in the corpus (and remove noise, i.e. empty strings from the collocations).
```{r}
# only keep collocations that appear more than once
top_colls_con <- con_colls10[con_colls10$n > 1, ]
# remove empty (or whitespace-only) collocates; test the string itself
# instead of comparing it with " ", because the result of a string
# comparison depends on the collation order of the current locale
top_colls_con <- top_colls_con[nzchar(trimws(top_colls_con$word)), ]
```

And save the final table to a csv file.
```{r}
#write.csv(top_colls_con, "../output/top_collocations_con.csv")
```

# 2. Concordances (KWIC)
To retrieve the context of each compound word, we extract the concordances on a sentence level. That means, we extract a window of 5 sentences to the left and to the right of the keyword sentence. To do this, we must tokenize our data by sentences, instead of words.

## 2.1 Preprocessing 
Since we cannot normalize the data the same way when we tokenize it on sentence-level, we firstly create word tokens from the corpora.
```{r}
# word-level tokenisation shared by both corpora: punctuation is kept,
# symbols, numbers, URLs and separators are removed
tokenize_words <- function(x) {
  tokens(x, remove_punct = FALSE, remove_symbols = TRUE, remove_numbers = TRUE,
         remove_url = TRUE, remove_separators = TRUE)
}

# word tokens for P2022 and C2022
p2022_tokens <- tokenize_words(pro2022)
c2022_tokens <- tokenize_words(contra2022)
```


To these tokens, we apply a normalization step where we remove hyphens within words, such as "Klima-Skeptiker" to convert it to "Klimaskeptiker". 
```{r}
# remove hyphens from tokens

### P2022
# replace multi-token sequences with a "compound" token
toks_comp_p <- tokens_compound(p2022_tokens, phrase("*-*"), concatenator = "")

# get token types containing a hyphen between word characters
toks_hyphenated_p <- grep("\\w+-\\w+", types(toks_comp_p), value = TRUE)

# replace the hyphenated types by versions without hyphen; the patterns are
# literal token types, so they are matched exactly ("fixed", case-sensitive)
# rather than as case-insensitive glob patterns, which could map a type to
# the replacement of a differently-cased or wildcard-matching type
p2022_toks_cleaned <- tokens_replace(
  toks_comp_p,
  pattern = toks_hyphenated_p,
  replacement = gsub("-", "", toks_hyphenated_p, fixed = TRUE),
  valuetype = "fixed",
  case_insensitive = FALSE
)

### C2022
# replace multi-token sequences with a "compound" token
toks_comp_c <- tokens_compound(c2022_tokens, phrase("*-*"), concatenator = "")

# get token types containing a hyphen between word characters
toks_hyphenated_c <- grep("\\w+-\\w+", types(toks_comp_c), value = TRUE)

# replace the hyphenated types by versions without hyphen (exact matching)
c2022_toks_cleaned <- tokens_replace(
  toks_comp_c,
  pattern = toks_hyphenated_c,
  replacement = gsub("-", "", toks_hyphenated_c, fixed = TRUE),
  valuetype = "fixed",
  case_insensitive = FALSE
)

# merge tokens back into corpus objects: one space-separated string per
# document; vapply() guarantees a character vector even for empty input
p2022_merged_toks <- corpus(
  vapply(as.list(p2022_toks_cleaned), paste, character(1), collapse = " ")
)
c2022_merged_toks <- corpus(
  vapply(as.list(c2022_toks_cleaned), paste, character(1), collapse = " ")
)

```

Now we can create sentence tokens for both corpora.
```{r}
# sentence-level tokenisation shared by both corpora (same removal options
# as for the word tokens, but what = "sentence")
tokenize_sentences <- function(x) {
  tokens(x, remove_punct = FALSE, remove_symbols = TRUE, remove_numbers = TRUE,
         remove_url = TRUE, remove_separators = TRUE, what = "sentence")
}

# "sentence" tokens for the P2022 and C2022 corpus
p2022_sentences <- tokenize_sentences(p2022_merged_toks)
c2022_sentences <- tokenize_sentences(c2022_merged_toks)
```

## 2.2 Key Word In Context Retrieval
```{r}
# Collect, for every compound, the sentence tokens matching it together with
# a window of 5 sentence tokens before and after.
# The former 7-column empty data-frame placeholders were dropped: they were
# overwritten by the rbind() result without ever being used.

### DO FOR p2022 ###
kwiclist_sent_pro <- list()

# for each compound word
for (word in compounds) {
  # the compound is used as a regular expression on the sentence tokens
  context_pro_sent <- kwic(p2022_sentences, pattern = word,
                           valuetype = "regex", window = 5)
  kwiclist_sent_pro[[word]] <- context_pro_sent # save to list
}

kwic_pro_sent.df <- do.call(rbind, kwiclist_sent_pro) # combine all hits

### DO FOR c2022 ###
kwiclist_sent_con <- list()

# for each compound word
for (word in compounds) {
  # the compound is used as a regular expression on the sentence tokens
  context_con_sent <- kwic(c2022_sentences, pattern = word,
                           valuetype = "regex", window = 5)
  kwiclist_sent_con[[word]] <- context_con_sent # save to list
}

kwic_con_sent.df <- do.call(rbind, kwiclist_sent_con) # combine all hits

# show both concordance tables
kwic_pro_sent.df
kwic_con_sent.df
```

## 2.3 Export Concordances
```{r}
# save to csv file 
#write.csv(kwic_pro_sent.df,"../output/pro_context_new.csv", row.names = FALSE)
#write.csv(kwic_con_sent.df,"../output/con_context_new.csv", row.names = FALSE)
```

# 3. Term Frequencies
Additionally, we compute the term frequency of each compound word and the corresponding TF-IDF score, since the two corpora differ in size and we want to explore the relevance of each term. 

Create a function to normalize the TF-IDF scores
```{r}
# Min-max normalization to the range [0, 1], relative to the values in `x`.
#
# x:     numeric vector (names, if any, are kept)
# na.rm: if TRUE (default), missing values are ignored when determining the
#        minimum and maximum and stay NA in the result; if FALSE, any NA in
#        `x` makes the whole result NA
# value: numeric vector of the same length as `x`; the smallest value maps to
#        0 and the largest to 1. If all values are equal the range is zero and
#        the result is NaN (0 / 0).
normalize <- function(x, na.rm = TRUE) {
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}
```


```{r}
# Re-create one corpus per subdiscourse from the lemmatized tokens.
# We need this step for the grouping of the frequencies.
# Each document's tokens are joined with single spaces; vapply() guarantees
# exactly one string per document.
p2022_cleaned <- corpus(
  vapply(sp_p2022_tokens, paste, character(1), collapse = " ")
)
c2022_cleaned <- corpus(
  vapply(sp_c2022_tokens, paste, character(1), collapse = " ")
)

# label every document with its group (stored as the docvar `group`)
p2022_cleaned$group <- "activists"
c2022_cleaned$group <- "skeptics"

# create a corpus containing both subdiscourses
complete <- p2022_cleaned + c2022_cleaned
```

```{r}
# Create a dfm with the compound frequencies per group.
# The corpus is tokenized explicitly: calling dfm() directly on a corpus is a
# deprecated shortcut in quanteda 3 (NOTE(review): believed removed in
# quanteda 4 - confirm against the installed version).
dfm_complete_freq <- complete %>%
  tokens() %>%
  dfm() %>%
  dfm_keep(pattern = compounds) %>% # only keep compound words
  dfm_group(groups = group) # one row per group: "activists" and "skeptics"

# Convert the dfm to a data frame and transpose it for export.
# Note: t() turns the data frame into a character matrix (the doc_id column
# is text), with one row per column of the data frame.
dfm_complete_df <- dfm_complete_freq %>%
  convert(to = "data.frame") %>%
  t()

set.seed(132) # set seed for reproducibility
# comparison wordcloud of the two groups
textplot_wordcloud(dfm_complete_freq, comparison = TRUE, max_words = 250)

#write.csv(dfm_complete_df,"/Users/anna/Documents/uni/thesis/implementation/R/output/tf_complete.csv", row.names = TRUE)
```
And compute TF-IDF of the DFMs
```{r}
# Compute normalized TF-IDF scores of the compound words for both corpora and
# export them side by side.
#
# dplyr verbs are called with an explicit `dplyr::` prefix because plyr is
# attached after dplyr in this notebook and masks arrange(), desc(), mutate()
# and others.

### FOR C2022
# dfm of the lemmatized tokens, weighted by TF-IDF, compound words only
dfm_c2022 <- dfm(sp_c2022_tokens) %>%
  dfm_tfidf() %>%
  dfm_keep(pattern = compounds)

# TF-IDF weight of the (at most) 300 top compounds, min-max normalized
c2022_tfidf <- normalize(topfeatures(dfm_c2022, n = 300))

# convert the named score vector into a data frame, highest score first
top_c2022_norm <- data.frame(
  Term = names(c2022_tfidf),
  Freq_C2022 = c2022_tfidf,
  row.names = NULL
) %>%
  dplyr::arrange(dplyr::desc(Freq_C2022))

### FOR P2022
# dfm of the lemmatized tokens, weighted by TF-IDF, compound words only
dfm_p2022 <- dfm(sp_p2022_tokens) %>%
  dfm_tfidf() %>%
  dfm_keep(pattern = compounds)

# TF-IDF weight of the (at most) 300 top compounds, min-max normalized
p2022_tfidf <- normalize(topfeatures(dfm_p2022, n = 300))

# convert the named score vector into a data frame, highest score first
top_p2022_norm <- data.frame(
  Term = names(p2022_tfidf),
  Freq_P2022 = p2022_tfidf,
  row.names = NULL
) %>%
  dplyr::arrange(dplyr::desc(Freq_P2022))

# full outer join on the term: compounds that occur in only one corpus are
# kept, with NA in the other corpus' score column
df_merge <- merge(top_c2022_norm, top_p2022_norm, by = "Term", all = TRUE)

# write to csv file
# NOTE(review): absolute, machine-specific path; the other exports in this
# notebook use "../output/..." - consider switching for reproducibility.
write.csv(df_merge,"/Users/anna/Documents/uni/thesis/implementation/R/output/tfidf_complete.csv", row.names = TRUE)
```


## Plot TF-IDF Scores
```{r}
# Dumbbell plot comparing the normalized TF-IDF scores of both groups.
#
# dplyr verbs are called with an explicit `dplyr::` prefix because plyr is
# attached after dplyr in this notebook and masks arrange() and mutate().

# retrieve frequency table of each dfm
# (force = TRUE is required because the dfms are TF-IDF weighted)
freqs_pro <- textstat_frequency(dfm_p2022, force = TRUE)
freqs_con <- textstat_frequency(dfm_c2022, force = TRUE)

# capitalize first letter of compound
freqs_pro$feature <- str_to_title(freqs_pro$feature)
freqs_con$feature <- str_to_title(freqs_con$feature)

# apply min-max normalization per group, rounded to three digits
freqs_pro$normalize <- round(normalize(freqs_pro$frequency), 3)
freqs_con$normalize <- round(normalize(freqs_con$frequency), 3)

# keep only the feature name and its normalized score
freqs.act <- freqs_pro %>%
  as.data.frame() %>%
  dplyr::select(feature, normalize)
freqs.scept <- freqs_con %>%
  as.data.frame() %>%
  dplyr::select(feature, normalize)

# Join on the activists' features (normalize.x = activists,
# normalize.y = sceptics; NA where the sceptics lack the term), keep the
# first 50 rows and order the factor levels by the activists' score so that
# the plot is sorted.
freqs <- dplyr::left_join(freqs.act, freqs.scept, by = "feature") %>%
  head(50) %>%
  dplyr::arrange(normalize.x) %>%
  dplyr::mutate(feature = factor(feature, feature))

# create plot: one grey segment per compound connecting the two group scores
plot8 <- ggplot(freqs) +
  geom_segment(aes(x = feature, xend = feature, y = normalize.x, yend = normalize.y), color = "grey") +
  geom_point(aes(x = feature, y = normalize.x, colour = "Activists"), size = 3) +
  geom_point(aes(x = feature, y = normalize.y, colour = "Sceptics"), size = 3) +
  ggtitle("Comparison 'Klima' TF-IDF Scores per Group") +
  xlab("") + ylab("TF-IDF") +
  coord_flip()

plot8 + labs(colour = "Group")

# save to png 
#ggsave("/Users/anna/Documents/uni/thesis/plots/comparison_tfidf.png", dpi=300, dev='png', height=10, width=15, units="in")
```
# TO DELETE!!!!
## Plot TF-IDF Scores
```{r}
# LEGACY CHUNK (marked "TO DELETE" by the author): older variant of the
# TF-IDF comparison plot, restricted to the compound list and to 30 terms.
#
# NOTE(review): in the "compute TF-IDF" chunk of this notebook, p2022_tfidf
# and c2022_tfidf are assigned the result of normalize(topfeatures(...)),
# i.e. named numeric vectors, while the commented-out lines below created them
# as dfm_tfidf() objects. If the chunks are run top to bottom, the
# textstat_frequency() calls below presumably fail - confirm, then either
# restore the dfm_tfidf() assignments or delete this chunk.
# NOTE(review): unlike the other exports, the ggsave() call at the end is
# active and writes to an absolute, machine-specific path.

# create a sample of the dfm with all words starting with "klima..." 
#klima_p2000 <- dfm_select(dfm_p2000_lemma, pattern="klima*")
#klima_c2000 <- dfm_select(dfm_c2000_lemma, pattern="klima*")

#####
#c2022_dfm <- dfm(sp_c2022_tokens) 
#c2022_tfidf <- dfm_tfidf(c2022_dfm)

#p2022_dfm <- dfm(sp_p2022_tokens) 
#p2022_tfidf <- dfm_tfidf(p2022_dfm)
######


#dfm_c2022 <- dfm(sp_c2022_tokens) %>% dfm_keep(pattern = compounds)
#dfm_c2022_tfidf <- dfm_tfidf(dfm_c2022) # compute tfidf scores
#top_c2022_norm <- normalize(topfeatures(dfm_c2022, n=300))

# calculate tfidf for "klima" words
#p2000_tfidf <- dfm_tfidf(klima_p2000, scheme_tf = "prop", scheme_df = "inverse")
#c2000_tfidf <- dfm_tfidf(klima_c2000, scheme_tf = "prop", scheme_df = "inverse")

# retrieve frequencies for "klima" words
# (overwrites freqs_pro / freqs_con from the previous plot chunk)
freqs_pro <- textstat_frequency(p2022_tfidf, force=TRUE)
freqs_con <- textstat_frequency(c2022_tfidf, force=TRUE)

# apply normalization
#freqs_pro$normalize = round(normalize(freqs_pro$frequency), 3)
#freqs_con$normalize = round(normalize(freqs_con$frequency), 3)

# retrieve only words that are contained in our final compound list 
# and capitalize the first letter of each compound
freqs_pro_subset <- freqs_pro[freqs_pro$feature %in% compounds, ]
freqs_pro_subset$feature <- str_to_title(freqs_pro_subset$feature)

freqs_con_subset <- freqs_con[freqs_con$feature %in% compounds, ]
freqs_con_subset$feature <- str_to_title(freqs_con_subset$feature)


# min-max normalize the frequencies within each subset, rounded to 3 digits
freqs_pro_subset$normalize = round(normalize(freqs_pro_subset$frequency), 3)
freqs_con_subset$normalize = round(normalize(freqs_con_subset$frequency), 3)

# plot comparison of both groups -> only words from compound list 
# filter() without a condition keeps all rows; after the join, normalize.x
# holds the activists' score and normalize.y the sceptics' score
freqs.act <- filter(freqs_pro_subset) %>% as.data.frame() %>% select(feature, normalize)
freqs.scept <- filter(freqs_con_subset) %>% as.data.frame() %>% select(feature, normalize)
freqs <- left_join(freqs.act, freqs.scept, by = "feature") %>% head(30) %>% arrange(normalize.x) %>% mutate(feature = factor(feature, feature))

# create plot: one grey segment per compound connecting the two group scores
plot8 <- ggplot(freqs) +
    geom_segment(aes(x=feature, xend=feature, y=normalize.x, yend=normalize.y), color="grey") +
    geom_point(aes(x=feature, y=normalize.x, colour="Activists"), size = 3) +
    geom_point(aes(x=feature, y=normalize.y, colour="Sceptics"), size = 3) +
    ggtitle("Comparison 'Klima' TF-IDF Scores per Group") + 
    xlab("") + ylab("TF-IDF") +
    coord_flip()

plot8+labs(colour="Group")

# save the plot to a png file
ggsave("/Users/anna/Documents/uni/thesis/plots/comparison_klima_freqs_lemma.png", dpi=300, dev='png', height=6, width=12, units="in")
```



# TO DELETE


```{r}
# Ad-hoc look-ups of single compounds with one token of context per side.
# c2022_merged_toks is a corpus, so it is tokenized first: calling kwic()
# directly on a corpus is a deprecated shortcut in quanteda 3
# (NOTE(review): believed removed in quanteda 4 - confirm).
kwic(tokens(c2022_merged_toks), pattern = "klimaanbeter", window = 1,
     valuetype = "regex")
# c2022_sentences holds sentence tokens, so the context here is one sentence
kwic(c2022_sentences, pattern = "klimabank", window = 1, valuetype = "regex")

```


## 1.2 Apply to all Glossary Terms
Now we will retrieve the collocations for each compound word. The collocations are then saved to a data frame, one for each corpus, and exported to a CSV file, so that we can also use the data in Python.
```{r}
# Collect the direct collocations (one token to the left and right) of every
# compound word in a tokens object.
#
# toks:     tokens object to search
# patterns: character vector of compounds; each one is matched as a fixed
#           string (valuetype = "fixed")
# value:    data frame with the columns docname, from, to, pre, keyword, post
#           and pattern; one row per match, grouped in the order of `patterns`
#
# The matches are gathered in a list and bound once at the end instead of
# growing the data frame with rbind() inside a loop.
collect_collocations <- function(toks, patterns) {
  # zero-row template: fixes the column layout and keeps the result a plain
  # data frame even if no compound matches
  template <- data.frame(docname = character(),
                         from = integer(),
                         to = integer(),
                         pre = character(),
                         keyword = character(),
                         post = character(),
                         pattern = factor())

  # look up the collocations of each compound
  hits <- lapply(patterns, function(word) {
    kwic(toks, pattern = word, window = 1, valuetype = "fixed") %>%
      as_tibble()
  })

  do.call(rbind, c(list(template), hits))
}

### C2022
collocations_con <- collect_collocations(sp_c2022_tokens, compounds)

### P2022
collocations_pro <- collect_collocations(sp_p2022_tokens, compounds)
```

Please run the following lines to save the output to a csv file. 
```{r}
#write.csv(collocations_con, "../output/collocations_con.csv")
#write.csv(collocations_pro, "../output/collocations_pro.csv")
```



### TO REPLACE COMPOUND FORMS BY THEIR LEMMA
```{r}
# Replace four token forms by the full compound word in both lemmatized
# token sets. Names are the forms to look up, values their replacements;
# each form is matched as a complete token (valuetype = "fixed").
lemma_fixes <- c(
  klimaglaubenslehr = "klimaglaubenslehre",
  klimakarawan = "klimakarawane",
  klimazeug = "klimazeugs",
  klimawendehal = "klimawendehals"
)

sp_c2022_tokens <- tokens_replace(sp_c2022_tokens,
                                  pattern = names(lemma_fixes),
                                  replacement = unname(lemma_fixes),
                                  valuetype = "fixed")

sp_p2022_tokens <- tokens_replace(sp_p2022_tokens,
                                  pattern = names(lemma_fixes),
                                  replacement = unname(lemma_fixes),
                                  valuetype = "fixed")

# Replace all word forms of a compound by its original (lemma) form.
# NOTE(review): unlist_forms() is only defined in a later chunk of this
# notebook, so that chunk has to be run first.
# NOTE(review): pro2000_tokens / contra2000_tokens are not created anywhere
# in the visible part of the notebook - presumably left over from an earlier
# version; confirm which token objects are meant.
for (word_form in c("glaubenslehre")) {
  word <- c(unlist_forms(word_form)) # turn into correct format

  # first compound whose list of forms contains the first word form;
  # grepl() replaces data.table's %like%, which is not attached here
  matching_rows <- grepl(word[[1]], compound_df$compound_forms)
  original <- compound_df[matching_rows, ]$original[[1]]
  lemma <- rep(original, length(word))

  # replace string in tokens with lemma form (for pro2000 and contra2000)
  pro2000_tokens <- tokens_replace(pro2000_tokens, word, lemma,
                                   valuetype = "fixed")
  contra2000_tokens <- tokens_replace(contra2000_tokens, word, lemma,
                                      valuetype = "fixed")
}
```


```{r}
# Helper for preprocessing the compounds data frame.
# Takes one or more comma-separated strings of word forms, splits them at the
# commas and returns all forms as one flat character vector with every space
# removed.
unlist_forms <- function(word) {
  forms <- unlist(strsplit(word, ","))
  gsub(" ", "", forms)
}

# Apply unlist_forms() to the compound_forms column of the compounds data
# frame: the result is one flat character vector with all word forms of all
# compounds.
compound_forms <- unlist_forms(compound_df$compound_forms)



# (disabled) print every word form on its own line
#for (word in compound_forms){
 # print(word)
#}

# show the resulting vector
compound_forms
```

